home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Collection of Tools & Utilities
/
Collection of Tools and Utilities.iso
/
tex
/
egrep.zip
/
EGREP.C
< prev
next >
Wrap
C/C++ Source or Header
|
1988-03-31
|
12KB
|
451 lines
/*
Boyer/Moore/Gosper-assisted 'egrep' search, with delta0 table as in
original paper (CACM, October, 1977). No delta1 or delta2. According to
experiment (Horspool, Soft. Prac. Exp., 1982), delta2 is of minimal
practical value. However, to improve for worst case input, integrating
the improved Galil strategies (Apostolico/Giancarlo, Siam. J. Comput.,
February 1986) deserves consideration.
Method: extract longest metacharacter-free string from expression.
this is done using a side-effect from henry spencer's regcomp().
use boyer-moore to match such, then pass submatching lines
to rexexp() for short input, or standard 'egrep' for long
input. (this tradeoff is due to the general slowness of the
regexp() nondeterministic machine on complex expressions,
as well as the startup time of 'egrep' on short files.)
alternatively, you may change the faster 'egrep' automaton
to include boyer-moore directly.
Future:
beef up for multiple patterns ala bm/fgrep. can do fast -n
via file rescan, but it's a luxury. adapt 'fastfind'.
internationalize for kanji.
James A. Woods Copyright (c) 1986
NASA Ames Research Center
*/
#ifdef V7
#define BSD
#define void int
#endif
#include <stdio.h>
#include <ctype.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <regexp.h> /* must be henry spencer's version */
#ifdef BSD
#define strchr index
#endif
#ifndef EGREP
#define EGREP "/bin/local/egrep" /* prevent installation-dependent recursion */
#endif
#define BUFSIZE 8192
#define PATSIZE 1000
#define LARGE BUFSIZE + PATSIZE
#define FSIZE 50000 /* algorithm tradeoff at this length (ad hoc) */
#define NL '\n'
#define EOS '\0'
extern char *optarg;
extern int optind;
int cflag, iflag, eflag, fflag, lflag;
int boyflag, rxflag;
int hflag;
int nfile, nsuccess;
long nmatch;
regexp *rspencer;
char *pattern, *patboy;
int delta0[256]; /* ascii only -- see note at gosper() */
char cmap[256]; /* (un)folded characters */
char str[BUFSIZE+2];
char linetemp[BUFSIZE];
char grepcmd[PATSIZE];
struct stat stbuf;
int fd;
FILE *egout, *mcilroy(), *popen();
char *strchr(), *strcpy(), *strncpy(), *strpbrk(), *malloc();
char *fold(), *sys5fold();
main ( argc, argv )
int argc; char *argv[];
{
int c;
int errflag = 0;
int oldegrep = 0;
while ( (c = getopt ( argc, argv, "bchie:f:lnv" ) ) != EOF )
switch(c) {
case 'f':
fflag++;
case 'b':
case 'n':
case 'v':
oldegrep++; /* boyer-moore of little help here */
continue;
case 'c':
cflag++;
continue;
case 'e':
eflag++;
pattern = optarg;
continue;
case 'h':
hflag++; /* shh ... for newshounds */
continue;
case 'i':
iflag++;
continue;
case 'l':
lflag++;
continue;
case '?':
errflag++;
}
argc -= optind;
if ( errflag || ((argc <= 0) && !fflag) )
oops ( "usage: egrep [-bcilnv] [-e exp] [-f file] [strings] [file]" );
if ( !eflag ) {
pattern = argv[optind++];
argc--;
}
if ( oldegrep || (strchr ( pattern, '\n' ) != NULL) ) {
execvp ( EGREP, argv );
oops ( "can't exec old 'egrep'" );
}
if ( iflag ) {
strcpy ( pattern, fold ( pattern ) );
}
if ( strpbrk ( pattern, "^$.[]()?+*|\\" ) == NULL ) { /* do boyer-moore only */
boyflag++;
rxflag = 0;
patboy = pattern;
}
else {
/*
first compile a fake regexp to isolate longest
metacharacter-free string
*/
{
char *dummyp;
dummyp = malloc ( (unsigned) strlen ( pattern ) + 5 );
sprintf ( dummyp, "(.)*%s", pattern );
rspencer = regcomp ( dummyp );
}
if ( rspencer -> regmust == NULL ) { /* pattern too complicated */
execvp ( EGREP, argv );
oops ( "can't exec old 'egrep'" );
}
patboy = rspencer -> regmust;
if ( (rspencer = regcomp ( pattern )) == NULL )
oops ( "egrep: regcomp failure" );
}
gosper ( patboy );
argv = &argv[optind];
nfile = argc;
if ( argc <= 0 ) { /* process stdin */
if ( lflag )
exit ( 1 );
fd = 0;
if ( !boyflag )
if ( (egout = mcilroy ( (char *) NULL )) == NULL )
oops ( "egrep: no processes" );
boyermoore ( (char *) NULL, patboy );
if ( !boyflag )
pclose ( egout );
}
else {
while ( --argc >= 0 ) {
if ( (fd = open ( *argv, 0 )) <= 0 ) {
fprintf ( stderr, "egrep: can't open %s\n", *argv );
nsuccess = 2;
argv++;
continue;
}
if ( !boyflag ) {
fstat ( fd, &stbuf );
if ( stbuf.st_size < FSIZE )
rxflag = 1;
else {
rxflag = 0;
if ( (egout = mcilroy ( *argv )) == NULL )
oops ( "egrep: no processes" );
}
}
boyermoore ( *argv, patboy );
if ( !boyflag && !rxflag ) {
fflush ( egout );
pclose ( egout );
}
close ( fd );
argv++;
}
}
exit ( (nsuccess == 2) ? 2 : (nsuccess == 0) );
}
boyermoore ( file, pattern ) /* "reach out and boyer-moore search someone" */
char *file, *pattern; /* -- soon-to-be-popular bumper sticker */
{
register char *k, *strend, *s;
register int j;
int patlen;
char *t;
char *gotamatch();
int nleftover, count;
patlen = strlen ( pattern );
nleftover = nmatch = 0;
#ifdef V7
#define read xread
#endif
while ( (count = read ( fd, str + nleftover, BUFSIZE - nleftover )) > 0 ) {
#undef read
count += nleftover;
if ( count != BUFSIZE && fd != 0)
str[count++] = NL; /* insurance for broken last line */
str[count] = 0;
for ( j = count - 1; str[j] != NL && j >= 0; )
j--;
if ( j < 0 ) { /* break up long line */
str[count] = NL;
str[++count] = EOS;
strend = str + count;
linetemp[0] = EOS;
nleftover = 0;
}
else { /* save partial line */
strend = str + j;
nleftover = count - j - 1;
strncpy ( linetemp, str + j + 1, nleftover );
}
k = str + patlen - 1;
for ( ; ; ) {
/*
for a large class of patterns, upwards of 80% of
match time is spent on the next line.
we beat existing microcode (vax 'matchc') this way.
*/
#ifndef V7
while ( (k += delta0[*(unsigned char *)k]) < strend )
;
#else
while ( (k += delta0[*(char *)k & 0377]) < strend )
;
#endif
if ( k < (str + LARGE) )
break;
k -= LARGE;
j = patlen - 1;
s = k - 1;
while ( cmap[*s--] == pattern[--j] )
;
/*
delta-less shortcut for literati, but
short shrift for genetic engineers.
*/
if ( j >= 0 )
k++;
else { /* submatch */
t = k;
s = k - patlen + 1;
do
;
while ( *s != NL && --s >= str );
k = s + 1; /* at line start */
if ( boyflag ) {
if ( (k = gotamatch ( file, k )) == NULL )
return;
}
else if ( !rxflag ) { /* fob off to egrep */
do
putc ( *k, egout );
while ( *k++ != NL );
}
else { /* regexec() wants EOS */
s = t;
do
;
while ( *s++ != NL );
*--s = EOS;
if ( regexec ( rspencer, ((iflag) ? fold ( k ) : k) ) == 1 ) {
*s = NL;
if ( gotamatch ( file, k ) == NULL )
return;
}
*s = NL;
k = s + 1;
}
if ( k >= strend )
break;
}
}
strncpy ( str, linetemp, nleftover );
}
if ( cflag && (boyflag || rxflag) ) {
if ( nfile > 1 )
printf ( "%s:", file );
printf ( "%ld\n", nmatch );
}
}
FILE *
mcilroy ( file ) /* open a pipe to old 'egrep' for long files, */
char *file; /* ... where rege